1 Foreword

This vignette is the fourth procuded during my internship in TIMC-IMAG lab (BCM team). It is divided into two parts. The first one is the visualization of transcription along with heatmap of selected_genes closests DMRs ( window = 100k). The second one offer a new physical gene partition to analyze methylation.

2 Genes DMR methylation

To obtain a satisfying view of genes DMRs, we firstly index probes per DMRs and then per genes based on a distance treshold, fixed here at 100k bp, giving a large window allowing the script to catch away DMRs.

targeted_genes <- penda_superup_deregulated
features <- get_features(targeted_genes, study = trscr_lusc, up_str = 7500, dwn_str = 7500)
get_indexed_DMRs <- function(DMRtable = DMR_table,
                             features_list = features,
                             regions_coordinates = platform,
                             treshold = 100000){
  
  DMRtable<- DMRtable[order(DMRtable[,"DMR_id"]),]
  
  print("Extractig regions coordinates per available genes...")
  
  pf<- lapply(rownames(features_list), function(gene){
    regions_coordinates[which(regions_coordinates[,"gene"]==gene),]
  })
  names(pf)<- rownames(features_list)
  
  
  
  print("Indexing DMRs per genes...")
  
  feat_indexed_DMRs <- epimedtools::monitored_apply(t(t(rownames(features_list))),mod=10,1, function(gene){
    
    print(noquote(paste0("Indexing DMRs for ", gene ,"...")))
    
    DMRs_of_interest <- DMRtable[which(DMRtable[,1]==features[gene,1]),]
    
    DMR_in_range <- lapply(1:nrow(DMRs_of_interest),function(index){
      DMR <- DMRs_of_interest[index,]
      
       if(nrow(DMRs_of_interest) > 0){
        if(as.numeric(DMR[["start"]]) > features_list[gene,"TSS"]){
          foo <- abs(as.numeric(DMR[["start"]])-features_list[gene,"TSS"])
          return(foo)
        }
        
        if(as.numeric(DMR[["start"]]) < features_list[gene,"TSS"]){
          foo <- abs(features_list[gene,"TSS"]-as.numeric(DMR[["end"]]))
          return(foo)
        }
       }
       
    })
    if (nrow(DMRs_of_interest) > 0){
      names(DMR_in_range)<-DMRs_of_interest[["DMR_id"]]
      DMR_candidates <- unlist(DMR_in_range)[DMR_in_range <= treshold]
    }
    else{DMR_candidates<- NULL}
    
      return(DMR_candidates)
      
  })
  
  names(feat_indexed_DMRs)<- rownames(features_list)
  
  
  return(feat_indexed_DMRs)
  
  gc()
}

DMRs100k <- get_indexed_DMRs()
table(sapply(DMRs100k, length))
barplot(table(sapply(DMRs100k, length)))

Even with a 100k large window, almost half of our genes do not have DMRs indexed, and 25% have only one, which is a serious problem : with such a few count of DMRs indexed per genes, it is impossible to distinguish a consistent pattern and to establish relevant hypothesis.

Once the indexing done, we can vizualise, with a plot_gene_dmr function :

plot_gene_dmr<- function(selected_gene = "OTX1",
                           expr_data = trscr_lusc$data,
                           meth_study = meth_lusc,
                           dmrs_table = DMR_table,
                           DMR_map  = DMRs100k,
                           fun = mean){
    
    
    
    tmp_DMR <- dmrs_table[names(DMR_map[[selected_gene]]),c("chr","start","end","is.hyper")]
    DMRs_of_interest<- tmp_DMR[order(tmp_DMR[,"start"]),]
    
    if(nrow(DMRs_of_interest) == 0) {stop(paste0("There is no DMRs indexed for ",selected_gene,". Check your DMR_map object"))}
    
    probes_on_chr <- epic[which(as.character(epic[,1]) == DMRs_of_interest[["chr"]][1]),]
    
    
    expr_values <- sort(expr_data[selected_gene,intersect(colnames(expr_data),colnames(meth_study$data))])
    meth_values <- meth_study$data[,intersect(colnames(expr_data),colnames(meth_study$data))]
    
    
    
    
    
    
    
    
    
    tmp_DMR <- dmrs_table[names(DMR_map[[selected_gene]]),c("chr","start","end","is.hyper")]
    DMRs_of_interest<- tmp_DMR[order(tmp_DMR[,"start"]),]
    probes_on_chr <- epic[which(as.character(epic[,1]) == DMRs_of_interest[["chr"]][1]),]
    
    print("Indexing methylation values per DMR...")
    vals_per_dmr<-epimedtools::monitored_apply(t(t(rownames(DMRs_of_interest))),1, function(i){
      
      DMR<-DMRs_of_interest[i,]   
      
      
      selected_probes <- probes_on_chr[which(probes_on_chr[,"start"] >= DMR[["start"]] & probes_on_chr[,"end"] <= DMR[["end"]]),]
      selected_meth_values <- meth_values[intersect(rownames(selected_probes),rownames(meth_values)),]
      
      
      if(class(selected_meth_values) != "numeric"){
        DMR_vals <- apply(selected_meth_values,2,fun,na.rm=T)}         #### Check if there is more than 2 selected probes
      
      else{DMR_vals <- rep(NA,length(selected_meth_values))}
      
      
      
      
      #### Else, returning raw values if the function is mean, else return a vector of NAs
      
      
      return(DMR_vals)
      
      
    })
    
    rownames(vals_per_dmr)<- colnames(meth_values)
    sorted_vals = vals_per_dmr[order(match(rownames(vals_per_dmr),names(expr_values))),]
    colnames(sorted_vals)<- rownames(DMRs_of_interest)
    
    
    cols1 <- epimedtools::monitored_apply(t(t(names(expr_values))),1, function(i){
      if(meth_study$exp_grp[i,14] == "tumoral"){col<- "red"}
      else {col<- "blue"}})
    
    print("Plotting...")
    
    layout(matrix(1:2,1), respect=TRUE)
    
    
    tmp = cbind(expr_values,1:length(expr_values))
    expression_plot = plot(tmp,
                           ylab = paste("Patient index ( ordered by level of expression)"),
                           xlab = paste("Expression level"),
                           main = "Expression/Transcription",
                           col = cols1,
                           xlim=c(0,20))
    legend("topleft",
           c("tumoral","normal"),
           xpd = TRUE,
           pch=20,
           col=c("red","blue"))
    
    
    colors=c("cyan", "black", "red")
    cols = colorRampPalette(colors)(100)
    breaks <- c(seq(0,0.33,length=35),seq(0.34,0.66,length=33),seq(0.67,1,length=33))
    
    image(t(sorted_vals),
          col=cols,
          breaks = breaks,
          axes=FALSE,
          main = paste0("# of DMRs : ",ncol(vals_per_dmr)))
    
    mtext(paste(noquote(selected_gene),"DMRs vizualisation",sep= " "), outer=TRUE,  cex=2, line=-2)
    
    
    return(sorted_vals)
    
    
    
}

plot_gene_dmr("PKMYT1")

plot_gene_dmr("CDT1")

plot_gene_dmr("ESPL1")

The color gradient used is defined by the breaks object : from 0 to 0.33, a gradient of cyan is used, from 0.34 to 0.66 is used a black gradient and from 0.66 to 1 is used a red gradient. The function also returns the table of values used to plot the heatmap, giving DMRs names and a more precise overview. Note that the fun used to aggregate values can be chosed, and is mean by default.

Yet, i didn’t find any obvious patterns of transcription/methylation correlated variations.

3 Updated genes expression profile

We first divided our genes into 6 bins (-2500,-1500,-500,TSS,+500,+1500,+2500) to overview methylation measures over it. Then, was proposed a second approach by dividing genes into biological regions such as exons, introns etc. The following work flow logically from it : downstream the TSS, the biological regions approach is conserved. Upstream the TSS, the gene is separated into 3 bins : -2500:-1500, -1500:-500 and -500:TSS, to refine our vizualisation.

3.1 Plot per genes

Here are some outputs of a plot_binreg function. For clarity, the large source code is not given here and is available on github. The examples plots are the same as used in the DMRs vignette on purpose.

As you mate note, the closest region to the TSS is sometimes overlapping with the TSS closest bin. It can lead to index a probe in both. This overlap is on purpose since borders of biological regions can be blurred, i thought it was smarter to risk some noise than determine for good the matter.

4 Wide database heatmaps

4.1 Heatmaps superup

We first define a get_indexed_binreg indexing probes as shown in previous plots.

get_indexed_binreg<-function(features_list = features,
                             epimed = epic,
                             platform_regions = platform,
                             data = meth_lusc$data){
  
  pf_chr_colname <- "seqnames"
  pf_pos_colname <- "start"
  chrs <- unique(features_list[, 1])
  chrs_indexed_epic <- lapply(chrs, function(chr) {
    idx <- rownames(epimed)[epimed[[pf_chr_colname]] %in% chr]
    ret <- epimed[idx, ]
    return(ret)
  })
  names(chrs_indexed_epic) <- chrs
  
  
  
  print(noquote(paste0("Indexing probes per bins...")))
  
  
  
  tmp_bins <- epimedtools::monitored_apply(features_list, 1, function(gene) {
    
    
    chr <- gene[[1]]
    meth_platform <- chrs_indexed_epic[[chr]]
    tmp_probes <- dmprocr::get_probe_names(gene, meth_platform, pf_chr_colname, pf_pos_colname, 7500, 7500)
    sub_epic <- meth_platform[tmp_probes, c(pf_chr_colname, pf_pos_colname)]
    
    tmp_gene <- gene
    tmp_gene[[6]] <- "+"
    if (gene[[6]] == "+") {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[2]]) - 2500
    } else {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[3]]) + 1000
    }
    tmp_u_bin <- 0
    tmp_d_bin <- 1500
    bin1 <- dmprocr::get_probe_names(tmp_gene, sub_epic, pf_chr_colname, pf_pos_colname, tmp_u_bin, tmp_d_bin)
    
    
    
    tmp_gene <- gene
    tmp_gene[[6]] <- "+"
    if (gene[[6]] == "+") {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[2]]) - 1000
    } else {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[3]]) + 500
    }
    tmp_u_bin <- 0
    tmp_d_bin <- 500
    bin2 <- dmprocr::get_probe_names(tmp_gene, sub_epic, pf_chr_colname, pf_pos_colname, tmp_u_bin, tmp_d_bin)
    
    tmp_gene <- gene
    tmp_gene[[6]] <- "+"
    if (gene[[6]] == "+") {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[2]]) - 500
    } else {
      tmp_gene[[2]] <- as.numeric(tmp_gene[[3]]) + 0
    }
    tmp_u_bin <- 0
    tmp_d_bin <- 500
    bin3 <- dmprocr::get_probe_names(tmp_gene, sub_epic, pf_chr_colname, pf_pos_colname, tmp_u_bin, tmp_d_bin)
    
    
    ret <- list(bin1 = bin1,
                bin2 = bin2,
                bin3 = bin3)
    
    return(ret)
    
  })
  
  bins <- apply(t(t(names(tmp_bins))),1,function(gene){
    lapply(tmp_bins[[gene]],unlist,recursive = FALSE)
  })
  names(bins)<-rownames(features_list)
  
  
  
  regions_name <- c("INTER","CDS","INTRON", "UTR5","UTR3")
  
  regions <- epimedtools::monitored_apply(t(t(rownames(features_list))),mod=20,1, function(gene){
    print(noquote(paste0("Indexing probes per regions for ",gene,"...")))
    
    
    TSS <- features_list[gene,"TSS"]
    
    
    tmp_probes <- feat_indexed_probes[[gene]]
    tmp_pf <- platform_regions[which(platform_regions[,"gene"]==gene),]
    sub_epic <- epimed[intersect(tmp_probes,rownames(data)),c("start","end")]
    
    if(features_list[gene,"strand"] == "+"){
      pf_gene <- tmp_pf[which(tmp_pf[,"end"] > TSS),]
    }
    
    if(features_list[gene,"strand"] == "-"){
      pf_gene <- tmp_pf[which(tmp_pf[,"start"] < TSS),]
    }
    
    
    
    pf_regions = lapply(regions_name, function(region){
      regions<-pf_gene[which(pf_gene[,"genomic_feature"]==region),]
      return(regions)
    })
    
    names(pf_regions) <- regions_name
    
    
    
    
    INTER = vector()
    P2000 = vector()
    UTR5 = vector()
    INTRON = vector()
    CDS = vector()
    UTR3 = vector()
    
    for (i in 1:nrow(sub_epic)){
      
      if (nrow(pf_regions$INTER)>0)
        if(sum(apply(pf_regions$INTER,1, function(window){
          sub_epic[i,"start"] > as.numeric(window[["start"]]) & sub_epic[i,"end"] < as.numeric(window[["end"]]) }),na.rm=T)>=1)
        {INTER[i]<-rownames(sub_epic[i,])}
      
      
      
      if (nrow(pf_regions$UTR5)>0)
        if(sum(apply(pf_regions$UTR5,1, function(window){
          sub_epic[i,"start"] > as.numeric(window[["start"]]) & sub_epic[i,"end"] < as.numeric(window[["end"]]) }),na.rm=T)>=1)
        {UTR5[i]<-rownames(sub_epic[i,])}
      
      
      
      if (nrow(pf_regions$INTRON)>0)
        if(sum(apply(pf_regions$INTRON,1, function(window){
          sub_epic[i,"start"] > as.numeric(window[["start"]]) & sub_epic[i,"end"] < as.numeric(window[["end"]]) }),na.rm=T)>=1)
        {INTRON[i]<-rownames(sub_epic[i,])}
      
      
      if (nrow(pf_regions$CDS)>0)
        if(sum(apply(pf_regions$CDS,1, function(window){
          sub_epic[i,"start"] > as.numeric(window[["start"]]) & sub_epic[i,"end"] < as.numeric(window[["end"]]) }),na.rm=T)>=1) 
        {CDS[i]<-rownames(sub_epic[i,])}
      
      if (nrow(pf_regions$UTR3)>0)
        if(sum(apply(pf_regions$UTR3,1, function(window){
          sub_epic[i,"start"] > as.numeric(window[["start"]]) & sub_epic[i,"end"] < as.numeric(window[["end"]]) }),na.rm=T)>=1)
        {UTR3[i]<-rownames(sub_epic[i,])}
      
      INTER <- INTER[!is.na(INTER)]
      INTRON <- INTRON[!is.na(INTRON)]
      CDS <- CDS[!is.na(CDS)]
      UTR3 <- UTR3[!is.na(UTR3)]
      UTR5 <- UTR5[!is.na(UTR5)]
      
      
    }
    
    ret <- list(sub_epic = sub_epic, INTER = INTER, UTR5 = UTR5, CDS = CDS, INTRON = INTRON, UTR3 = UTR3)
    return(ret)
    
  })
  names(regions)<- rownames(features_list)
  
  
  feat_indexed_probes_binreg <- apply(t(t(rownames(features_list))),1,function(gene){
    tmp <- unlist(c(regions[gene],bins[gene]),recursive=FALSE,use.names=FALSE)
    names(tmp)<-c(names(regions[[gene]]),names(bins[[gene]]))
    return(tmp)
  })
  names(feat_indexed_probes_binreg)<- rownames(features_list)
  
  return(feat_indexed_probes_binreg)
  
}
feat_ind <- get_indexed_binreg()

We can now repeat our usual pipeline to visualize obtained results. It is noticeable that Rsd heatmap ( relative squared error), defined as \(\frac{\sigma}{| \mu |} \in [0,+\infty[\), is not well really informative because of its limits properties which are hard to deal with in a algorithmic way.

4.1.1 Tumoral tissues

map_binreg<-reduce_map(feat_ind,c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))


means_per_regions_per_genes_per_patient_t <- reduce_rows(meth_tumoral,map_binreg, mean ,na.rm=T)
means_t <- subset_vals_per_bins(data = meth_tumoral,
                             values_per_patient = means_per_regions_per_genes_per_patient_t,
                             fun = mean,
                             binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))

meth_heatmap(means_t, main = "mean of means superup/tumoral")

sd_per_regions_per_genes_per_patient_t <- reduce_rows(meth_tumoral,map_binreg, sd ,na.rm=T)
sds_t <- subset_vals_per_bins(data = meth_tumoral,
                              values_per_patient = sd_per_regions_per_genes_per_patient_t,
                              fun = sd,
                              binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(sds_t, main = "mean of sd superup/tumoral")

rsd_per_regions_per_genes_per_patient_t <- reduce_rows(meth_tumoral,map_binreg, rsd ,na.rm=T)
rsds_t <- subset_vals_per_bins(data = meth_tumoral,
                            values_per_patient = rsd_per_regions_per_genes_per_patient_t,
                            fun = rsd,
                            binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(rsds_t, main = "mean of rsd superup/tumoral")

4.1.2 Healthy tissues

means_per_regions_per_genes_per_patient_h<- reduce_rows(meth_normal,map_binreg, mean ,na.rm=T)
means_h <- subset_vals_per_bins(data = meth_normal,
                              values_per_patient = means_per_regions_per_genes_per_patient_h,
                              fun = mean,
                              binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))

meth_heatmap(means_h, main = "mean of means superup/healthy")

sd_per_regions_per_genes_per_patient_h <- reduce_rows(meth_normal,map_binreg, sd ,na.rm=T)
sds_h <- subset_vals_per_bins(data = meth_normal,
                            values_per_patient = sd_per_regions_per_genes_per_patient_h,
                            fun = sd,
                            binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(sds_h, main = "mean of sd superup/healthy")

rsd_per_regions_per_genes_per_patient_h <- reduce_rows(meth_normal,map_binreg, rsd ,na.rm=T)
rsds_h <- subset_vals_per_bins(data = meth_normal,
                             values_per_patient = rsd_per_regions_per_genes_per_patient_h,
                             fun = rsd,
                             binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(rsds_h, main = "mean of rsd superup/healthy")

boxplot_res(means_h,means_t)

boxplot_res(sds_h,sds_t)

boxplot_res(rsds_h,rsds_t)

4.1.3 Differential values

means_per_regions_per_genes_per_patient_d<- reduce_rows(meth_diff,map_binreg, mean ,na.rm=T)
means_d <- subset_vals_per_bins(data = meth_diff,
                              values_per_patient = means_per_regions_per_genes_per_patient_d,
                              fun = mean,
                              binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))

meth_heatmap(means_d, main = "mean of means superup/differential")

sd_per_regions_per_genes_per_patient_d <- reduce_rows(meth_diff,map_binreg, sd ,na.rm=T)
sds_d <- subset_vals_per_bins(data = meth_diff,
                            values_per_patient = sd_per_regions_per_genes_per_patient_d,
                            fun = sd,
                            binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(sds_d, main = "mean of sd superup/differential")

rsd_per_regions_per_genes_per_patient_d <- reduce_rows(meth_diff,map_binreg, rsd ,na.rm=T)
rsds_d <- subset_vals_per_bins(data = meth_diff,
                             values_per_patient = rsd_per_regions_per_genes_per_patient_d,
                             fun = rsd,
                             binlist=c("bin1","bin2","bin3","INTER","UTR5","INTRON","CDS","UTR3"))
meth_heatmap(rsds_d, main = "mean of rsd superup/differential")

rsds_d

4.2 Heatmaps superdown

4.2.1 Tumoral tissues

4.2.2 Healthy tissues

4.2.3 Boxplots

4.2.4 Differential values

4.3 Heatmap superconserved

4.3.1 Tumoral tissues

4.3.2 Healthy tissues

4.3.3 Boxplots

4.3.4 Differential values